*-----------------------------------------------------------------
version 10.0
cap clear
cap log close
set more off
cd "E:\REStat_MS14767_Vol96(2)\Data preparation Compustat"
log using "9_match_cq.log", replace
*-----------------------------------------------------------------

*****************************************
* starting data2.dta to master_compustat_all_countries.dta
* This file reconstructs the connection between match.dta and compustat.dta
* so that the data from compustat dataset may later be merged with patent data.
* It is equivalent to the patents_match.do file.
* Match.dta contains 5000 entries.
**************************************************

set mem 600m

use "clean_compustat_master.dta"

keep CUSIP ticker name SIC4 countryinc
rename CUSIP cusip
rename SIC4 sic4
so cusip
drop if cusip==cusip[_n-1]

****************************************
* In order to find matches within match.dta, 9-digit cusip is changed to 6-digit cusip
* and renamed. cusip9 is the original cusip.
************************************

gen cusip6 = substr(cusip, 1, length(cusip) - 3)
rename cusip cusip9
rename cusip6 cusip
so cusip
label var cusip "6-digit cusip from compustat"
label var cusip9 "9-digit cusip from compustat" 

save "patent files\clean_compustat_master_red.dta", replace
clear

***************************
* match.dta has cusip as identifier, contains 5,000 entries compared to 
* 23,000 entries in compustat
******************************

use "patent files\patents\match.dta"
so cusip

merge cusip using "patent files\clean_compustat_master_red.dta"

tab _merge

********************************
*   _merge |      Freq.     Percent        Cum.
*------------+-----------------------------------
*          1 |      1,680        6.15        6.15
*          2 |     22,413       82.03       88.18
*          3 |      3,231       11.82      100.00
*------------+-----------------------------------
*      Total |     27,324      100.00
**************************************

drop if _merge==2
gen cusip_match=0
replace cusip_match=1 if _merge==3
drop _merge

gen cusip_name=name
replace name=cname
rename cusip9 cusip_cusip9
so name

label var assignee "assignee identifier"
label var cname "name in compustat data set"
label var cusip "unique identifier in compustat data set"
label var pname "name of parent firm"
label var sname "name of subsidary firm"
label var assname "patent assignee name "
label var cusip_cusip9 "9-digit cusip number"
label var cusip_name "original compustat name"
label var cusip_match "valid cusip connection between match and compustat"

desc

save "patent files\cusipmerged.dta", replace
clear

*****************************
* This was merging cusip, now comes merging names with match.dta (now cusipmerged.dta)
*******************************

use "patent files\clean_compustat_master_red.dta"
drop cusip
so name
save "patent files\clean_compustat_master_red.dta", replace
clear

use "patent files\cusipmerged.dta"
merge name using "patent files\clean_compustat_master_red.dta"

tab _merge
drop if _merge==2
gen name_match=0
replace name_match=1 if _merge==3
drop if _merge==2
drop _merge
rename cusip9 name_cusip9
count if cusip_match==0 & name_match==0

label var name_match "valid name connection between match and compustat"

save "patent files\cusipnamemerged.dta", replace

count if name_match==1
count if cusip_match==1

****************************
* Now the dataset contains 3003 name-matches and 3231 cusip-matches, 1589 non-matched***
**********************************

insheet using "patent files\manual_matching_cq_ip.csv", clear
save "patent files\manual_matching_cq_ip.dta", replace
drop in 1/1
rename v1 manual_name
rename v2 manual_cusip9
rename v3 cname
rename v4 cusip
rename v5 error
drop cusip manual_cusip
so manual_name
save "patent files\manual_matching_cq.dta", replace

use "patent files\clean_compustat_master_red.dta", clear
keep name cusip
rename name manual_name
rename cusip manual_cusip9
so manual_name
merge manual_name using "patent files\manual_matching_cq.dta"
tab _merge
keep if _merge==3
keep manual_name manual_cusip9 error cname
so cname
save "patent files\manual_matching_cq.dta", replace

use "patent files\cusipnamemerged.dta", clear
so cname
merge cname using "patent files\manual_matching_cq.dta"
gen manual=0
replace manual=1 if _merge==3 & manual_name~="n"
tab _merge
drop _merge

replace cname=manual_name if manual==1
drop manual_name
replace cusip=manual_cusip9 if manual==1
drop manual_cusip
replace cname=cusip_name if cusip_match==1 & name_match==0
drop cusip_name
replace cusip=name_cusip9 if cusip_match==0 & name_match==1
drop name_cusip9
replace cusip=cusip_cusip9 if cusip_match==1
drop cusip_cusip9

**********************************
* The following steps prepare and examine the merge between compustat and cusipnamemerged
* in order to append assignee numbers and names to compustat for later merge with patents
**********************************

so assignee
drop if assignee==assignee[_n-1]
drop if cusip_match==0 & name_match==0 & manual==0

save "patent files\cusipnamemerged.dta", replace

use "patent files\clean_compustat_master_red.dta", clear
rename cusip9 cusip
so cusip
save "patent files\clean_compustat_master_red.dta", replace

use "patent files\cusipnamemerged.dta", clear
so cusip
merge cusip using "patent files\clean_compustat_master_red.dta"
keep if _merge==3
tab _merge
drop _merge

so cusip
count if cusip!=cusip[_n-1]

*************************************************************
* Here it is checked for non-american entries, 
* those are dropped.
******************************************************

count if countryinc!=0 & countryinc!=.
* 64

drop if countryinc!=0 & countryinc!=.

save "patent files\cusipnamemerged.dta", replace

keep assignee cusip ticker sic4
so assignee
desc
save "patent files\cusipnamemerged_red.dta", replace

desc
log close


